# Python libraries
# Classic,data manipulation and linear algebra
import pandas as pd
import numpy as np
# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
from sklearn import *
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
#Loading the dataset
path = 'dataset/survey lung cancer.csv'
data = pd.read_csv(path)
#Print the first 5 rows of the dataframe.
data.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| GENDER | M | M | F | M | F |
| AGE | 69 | 74 | 59 | 63 | 63 |
| SMOKING | 1 | 2 | 1 | 2 | 1 |
| YELLOW_FINGERS | 2 | 1 | 1 | 2 | 2 |
| ANXIETY | 2 | 1 | 1 | 2 | 1 |
| PEER_PRESSURE | 1 | 1 | 2 | 1 | 1 |
| CHRONIC DISEASE | 1 | 2 | 1 | 1 | 1 |
| FATIGUE | 2 | 2 | 2 | 1 | 1 |
| ALLERGY | 1 | 2 | 1 | 1 | 1 |
| WHEEZING | 2 | 1 | 2 | 1 | 2 |
| ALCOHOL CONSUMING | 2 | 1 | 1 | 2 | 1 |
| COUGHING | 2 | 1 | 2 | 1 | 2 |
| SHORTNESS OF BREATH | 2 | 2 | 2 | 1 | 2 |
| SWALLOWING DIFFICULTY | 2 | 2 | 1 | 2 | 1 |
| CHEST PAIN | 2 | 2 | 2 | 2 | 1 |
| LUNG_CANCER | YES | YES | NO | NO | NO |
data.tail().T
| 304 | 305 | 306 | 307 | 308 | |
|---|---|---|---|---|---|
| GENDER | F | M | M | M | M |
| AGE | 56 | 70 | 58 | 67 | 62 |
| SMOKING | 1 | 2 | 2 | 2 | 1 |
| YELLOW_FINGERS | 1 | 1 | 1 | 1 | 1 |
| ANXIETY | 1 | 1 | 1 | 2 | 1 |
| PEER_PRESSURE | 2 | 1 | 1 | 1 | 2 |
| CHRONIC DISEASE | 2 | 1 | 1 | 1 | 1 |
| FATIGUE | 2 | 2 | 1 | 2 | 2 |
| ALLERGY | 1 | 2 | 2 | 2 | 2 |
| WHEEZING | 1 | 2 | 2 | 1 | 2 |
| ALCOHOL CONSUMING | 2 | 2 | 2 | 2 | 2 |
| COUGHING | 2 | 2 | 2 | 2 | 1 |
| SHORTNESS OF BREATH | 2 | 2 | 1 | 2 | 1 |
| SWALLOWING DIFFICULTY | 2 | 1 | 1 | 1 | 2 |
| CHEST PAIN | 1 | 2 | 2 | 2 | 1 |
| LUNG_CANCER | YES | YES | YES | YES | YES |
data.shape
(309, 16)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 309 entries, 0 to 308 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 GENDER 309 non-null object 1 AGE 309 non-null int64 2 SMOKING 309 non-null int64 3 YELLOW_FINGERS 309 non-null int64 4 ANXIETY 309 non-null int64 5 PEER_PRESSURE 309 non-null int64 6 CHRONIC DISEASE 309 non-null int64 7 FATIGUE 309 non-null int64 8 ALLERGY 309 non-null int64 9 WHEEZING 309 non-null int64 10 ALCOHOL CONSUMING 309 non-null int64 11 COUGHING 309 non-null int64 12 SHORTNESS OF BREATH 309 non-null int64 13 SWALLOWING DIFFICULTY 309 non-null int64 14 CHEST PAIN 309 non-null int64 15 LUNG_CANCER 309 non-null object dtypes: int64(14), object(2) memory usage: 38.8+ KB
data.describe()
| AGE | SMOKING | YELLOW_FINGERS | ANXIETY | PEER_PRESSURE | CHRONIC DISEASE | FATIGUE | ALLERGY | WHEEZING | ALCOHOL CONSUMING | COUGHING | SHORTNESS OF BREATH | SWALLOWING DIFFICULTY | CHEST PAIN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 309.000000 | 309.000000 | 309.000000 | 309.000000 | 309.000000 | 309.000000 | 309.000000 | 309.000000 | 309.000000 | 309.000000 | 309.000000 | 309.000000 | 309.000000 | 309.000000 |
| mean | 62.673139 | 1.563107 | 1.569579 | 1.498382 | 1.501618 | 1.504854 | 1.673139 | 1.556634 | 1.556634 | 1.556634 | 1.579288 | 1.640777 | 1.469256 | 1.556634 |
| std | 8.210301 | 0.496806 | 0.495938 | 0.500808 | 0.500808 | 0.500787 | 0.469827 | 0.497588 | 0.497588 | 0.497588 | 0.494474 | 0.480551 | 0.499863 | 0.497588 |
| min | 21.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 57.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 50% | 62.000000 | 2.000000 | 2.000000 | 1.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 1.000000 | 2.000000 |
| 75% | 69.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 |
| max | 87.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 |
data.rename(columns = {'GENDER' : 'GENDER', 'AGE' : 'AGE', 'SMOKING' : 'SMOKING', 'YELLOW_FINGERS' : 'YELLOW_FINGERS', 'ANXIETY' : 'ANXIETY',
'PEER_PRESSURE' : 'PEER_PRESSURE', 'CHRONIC DISEASE' : 'CHRONIC_DISEASE', 'FATIGUE ' : 'FATIGUE', 'ALLERGY ' : 'ALLERGY', 'WHEEZING' : 'WHEEZING',
'ALCOHOL CONSUMING' : 'ALCOHOL_CONSUMING', 'COUGHING' : 'COUGHING', 'SHORTNESS OF BREATH' : 'SHORTNESS_OF_BREATH',
'SWALLOWING DIFFICULTY' : 'SWALLOWING_DIFFICULTY', 'CHEST PAIN' : 'CHEST_PAIN', 'LUNG_CANCER': 'LUNG_CANCER'}, inplace = True)
data.columns
Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
'PEER_PRESSURE', 'CHRONIC_DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
'ALCOHOL_CONSUMING', 'COUGHING', 'SHORTNESS_OF_BREATH',
'SWALLOWING_DIFFICULTY', 'CHEST_PAIN', 'LUNG_CANCER'],
dtype='object')
for i in data.columns:
print(i,data[i].unique())
GENDER ['M' 'F'] AGE [69 74 59 63 75 52 51 68 53 61 72 60 58 48 57 44 64 21 65 55 62 56 67 77 70 54 49 73 47 71 66 76 78 81 79 38 39 87 46] SMOKING [1 2] YELLOW_FINGERS [2 1] ANXIETY [2 1] PEER_PRESSURE [1 2] CHRONIC_DISEASE [1 2] FATIGUE [2 1] ALLERGY [1 2] WHEEZING [2 1] ALCOHOL_CONSUMING [2 1] COUGHING [2 1] SHORTNESS_OF_BREATH [2 1] SWALLOWING_DIFFICULTY [2 1] CHEST_PAIN [2 1] LUNG_CANCER ['YES' 'NO']
# 2 datasets
No_Cancer= data[data['LUNG_CANCER'] == "NO"]
Cancer = data[data['LUNG_CANCER'] == "YES"]
print('No Lung Cancer Cases: {}'.format(len(No_Cancer)))
print('Lung Cancer Cases: {}'.format(len(Cancer)))
No Lung Cancer Cases: 39 Lung Cancer Cases: 270
def target_percent():
trace = go.Pie(labels = ['Has Lung Cancer','No Lung Cancer'], values = data['LUNG_CANCER'].value_counts(),
textfont=dict(size=20), opacity = 0.8,
marker=dict(colors=['green', 'red'],
line=dict(color='#000000', width=1.5)))
layout = dict(title = 'Distribution of Result variable')
fig = dict(data = [trace], layout=layout)
py.iplot(fig)
target_percent()
The above graph shows that the data is biased towards datapoints having outcome value as YES where it means that lung cancer was present actually.
Therefore, the data is unbalanced. The number of no-lung-cancer patients is 39 and the number of cancer patients is 270
df1 = data
df1 = df1.replace(to_replace = 2 ,value = "YES" )
df1 = df1.replace(to_replace = 1 ,value = "NO" )
df1.head()
| GENDER | AGE | SMOKING | YELLOW_FINGERS | ANXIETY | PEER_PRESSURE | CHRONIC_DISEASE | FATIGUE | ALLERGY | WHEEZING | ALCOHOL_CONSUMING | COUGHING | SHORTNESS_OF_BREATH | SWALLOWING_DIFFICULTY | CHEST_PAIN | LUNG_CANCER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | M | 69 | NO | YES | YES | NO | NO | YES | NO | YES | YES | YES | YES | YES | YES | YES |
| 1 | M | 74 | YES | NO | NO | NO | YES | YES | YES | NO | NO | NO | YES | YES | YES | YES |
| 2 | F | 59 | NO | NO | NO | YES | NO | YES | NO | YES | NO | YES | YES | NO | YES | NO |
| 3 | M | 63 | YES | YES | YES | NO | NO | NO | NO | NO | YES | NO | NO | YES | YES | NO |
| 4 | F | 63 | NO | YES | NO | NO | NO | NO | NO | YES | NO | YES | YES | NO | NO | NO |
df1.to_csv(r'dataset1.csv',index=False)
import plotly.express as px
for i in df1.columns:
fig = px.histogram(df1, x=i)
fig.update_layout(bargap=0.2,width=500, height=300)
fig.show()
encoding = {"GENDER": {"M": 1, "F": 0},"LUNG_CANCER": {"YES": 1, "NO": 0}}
df= data.replace(encoding)
df.head()
| GENDER | AGE | SMOKING | YELLOW_FINGERS | ANXIETY | PEER_PRESSURE | CHRONIC_DISEASE | FATIGUE | ALLERGY | WHEEZING | ALCOHOL_CONSUMING | COUGHING | SHORTNESS_OF_BREATH | SWALLOWING_DIFFICULTY | CHEST_PAIN | LUNG_CANCER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 69 | 1 | 2 | 2 | 1 | 1 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 1 |
| 1 | 1 | 74 | 2 | 1 | 1 | 1 | 2 | 2 | 2 | 1 | 1 | 1 | 2 | 2 | 2 | 1 |
| 2 | 0 | 59 | 1 | 1 | 1 | 2 | 1 | 2 | 1 | 2 | 1 | 2 | 2 | 1 | 2 | 0 |
| 3 | 1 | 63 | 2 | 2 | 2 | 1 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 2 | 2 | 0 |
| 4 | 0 | 63 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 2 | 1 | 2 | 2 | 1 | 1 | 0 |
df.to_csv(r'dataset2.csv',index=False)
# Correlation matrix
corrmat = df.corr()
fig = go.Figure(data = go.Heatmap( z = corrmat.values, x = list(corrmat.columns),y = list(corrmat.index),colorscale = 'Viridis',showscale=False))
fig.update_layout(title = 'Correlation', width=900,height=800)
fig.show()
#Correlation with output variable
cor_target = abs(corrmat["LUNG_CANCER"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.05]
relevant_features
GENDER 0.067254 AGE 0.089465 SMOKING 0.058179 YELLOW_FINGERS 0.181339 ANXIETY 0.144947 PEER_PRESSURE 0.186388 CHRONIC_DISEASE 0.110891 FATIGUE 0.150673 ALLERGY 0.327766 WHEEZING 0.249300 ALCOHOL_CONSUMING 0.288533 COUGHING 0.248570 SHORTNESS_OF_BREATH 0.060738 SWALLOWING_DIFFICULTY 0.259730 CHEST_PAIN 0.190451 LUNG_CANCER 1.000000 Name: LUNG_CANCER, dtype: float64
for i in data.columns:
if i != "AGE":
fig = px.histogram(data, x='AGE',color=i)
fig.update_layout(bargap=0.2,width=900, height=500)
fig.show()
data.dtypes
GENDER object AGE int64 SMOKING int64 YELLOW_FINGERS int64 ANXIETY int64 PEER_PRESSURE int64 CHRONIC_DISEASE int64 FATIGUE int64 ALLERGY int64 WHEEZING int64 ALCOHOL_CONSUMING int64 COUGHING int64 SHORTNESS_OF_BREATH int64 SWALLOWING_DIFFICULTY int64 CHEST_PAIN int64 LUNG_CANCER object dtype: object
df1 = df.groupby('AGE').agg({ 'GENDER' : 'count', 'SMOKING' : 'sum', 'YELLOW_FINGERS' : 'sum', 'ANXIETY' : 'sum',
'PEER_PRESSURE' : 'sum', 'CHRONIC_DISEASE' : 'sum', 'FATIGUE' : 'sum', 'ALLERGY' : 'sum', 'WHEEZING' : 'sum',
'ALCOHOL_CONSUMING' : 'sum', 'COUGHING' : 'sum', 'SHORTNESS_OF_BREATH' : 'sum',
'SWALLOWING_DIFFICULTY' : 'sum', 'CHEST_PAIN' : 'sum', 'LUNG_CANCER' : 'sum'})
df1
| GENDER | SMOKING | YELLOW_FINGERS | ANXIETY | PEER_PRESSURE | CHRONIC_DISEASE | FATIGUE | ALLERGY | WHEEZING | ALCOHOL_CONSUMING | COUGHING | SHORTNESS_OF_BREATH | SWALLOWING_DIFFICULTY | CHEST_PAIN | LUNG_CANCER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AGE | |||||||||||||||
| 21 | 1 | 2 | 1 | 1 | 1 | 2 | 2 | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 0 |
| 38 | 1 | 1 | 2 | 1 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 1 | 2 | 1 |
| 39 | 1 | 2 | 1 | 1 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 1 | 2 | 1 |
| 44 | 2 | 3 | 4 | 3 | 4 | 4 | 4 | 2 | 3 | 2 | 2 | 4 | 4 | 3 | 2 |
| 46 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 0 |
| 47 | 4 | 7 | 8 | 4 | 7 | 8 | 7 | 6 | 7 | 4 | 6 | 7 | 5 | 6 | 3 |
| 48 | 2 | 3 | 3 | 3 | 3 | 4 | 4 | 4 | 3 | 3 | 4 | 4 | 4 | 3 | 2 |
| 49 | 3 | 4 | 4 | 4 | 5 | 5 | 4 | 5 | 4 | 5 | 6 | 5 | 4 | 4 | 3 |
| 51 | 8 | 14 | 15 | 13 | 13 | 11 | 15 | 12 | 11 | 11 | 11 | 15 | 12 | 11 | 8 |
| 52 | 4 | 7 | 5 | 5 | 5 | 6 | 7 | 7 | 8 | 8 | 6 | 6 | 6 | 7 | 4 |
| 53 | 4 | 6 | 7 | 7 | 6 | 8 | 6 | 7 | 5 | 7 | 5 | 7 | 6 | 7 | 4 |
| 54 | 8 | 15 | 14 | 13 | 13 | 13 | 13 | 11 | 15 | 13 | 13 | 12 | 15 | 13 | 8 |
| 55 | 11 | 19 | 14 | 14 | 16 | 17 | 22 | 17 | 18 | 17 | 15 | 16 | 14 | 17 | 8 |
| 56 | 19 | 28 | 28 | 26 | 28 | 29 | 32 | 32 | 29 | 31 | 29 | 34 | 26 | 33 | 17 |
| 57 | 9 | 14 | 14 | 13 | 13 | 14 | 13 | 12 | 11 | 14 | 13 | 14 | 12 | 14 | 6 |
| 58 | 13 | 23 | 18 | 18 | 18 | 15 | 20 | 20 | 21 | 23 | 22 | 19 | 19 | 20 | 12 |
| 59 | 15 | 21 | 24 | 24 | 25 | 23 | 24 | 22 | 25 | 21 | 23 | 24 | 24 | 23 | 11 |
| 60 | 17 | 26 | 26 | 26 | 22 | 25 | 26 | 24 | 24 | 26 | 25 | 27 | 24 | 27 | 14 |
| 61 | 16 | 26 | 26 | 27 | 24 | 23 | 28 | 25 | 21 | 22 | 24 | 29 | 25 | 23 | 14 |
| 62 | 18 | 28 | 27 | 28 | 27 | 27 | 29 | 28 | 29 | 30 | 25 | 26 | 30 | 30 | 17 |
| 63 | 19 | 27 | 31 | 26 | 27 | 27 | 32 | 29 | 31 | 29 | 30 | 33 | 26 | 28 | 14 |
| 64 | 20 | 32 | 33 | 31 | 32 | 29 | 29 | 34 | 30 | 33 | 32 | 28 | 29 | 33 | 18 |
| 65 | 7 | 12 | 14 | 14 | 13 | 9 | 12 | 10 | 11 | 9 | 11 | 13 | 13 | 8 | 7 |
| 66 | 4 | 7 | 7 | 7 | 5 | 6 | 7 | 6 | 7 | 7 | 8 | 8 | 5 | 5 | 4 |
| 67 | 13 | 20 | 21 | 21 | 20 | 17 | 23 | 17 | 19 | 18 | 22 | 23 | 18 | 20 | 12 |
| 68 | 9 | 15 | 11 | 16 | 14 | 13 | 15 | 13 | 12 | 14 | 13 | 13 | 12 | 13 | 6 |
| 69 | 11 | 15 | 15 | 18 | 15 | 13 | 18 | 15 | 18 | 18 | 18 | 18 | 17 | 16 | 8 |
| 70 | 15 | 23 | 20 | 17 | 22 | 23 | 26 | 27 | 22 | 27 | 27 | 25 | 18 | 24 | 14 |
| 71 | 10 | 16 | 17 | 16 | 11 | 19 | 15 | 17 | 17 | 16 | 18 | 16 | 15 | 17 | 9 |
| 72 | 10 | 15 | 18 | 15 | 16 | 15 | 19 | 15 | 18 | 16 | 19 | 18 | 14 | 16 | 10 |
| 73 | 4 | 7 | 6 | 6 | 6 | 6 | 6 | 7 | 5 | 7 | 7 | 6 | 7 | 7 | 4 |
| 74 | 6 | 9 | 10 | 8 | 9 | 10 | 12 | 10 | 11 | 10 | 9 | 10 | 9 | 10 | 6 |
| 75 | 5 | 7 | 9 | 7 | 8 | 9 | 7 | 9 | 8 | 7 | 9 | 7 | 6 | 7 | 5 |
| 76 | 4 | 7 | 6 | 6 | 6 | 5 | 8 | 8 | 7 | 6 | 7 | 8 | 6 | 8 | 4 |
| 77 | 9 | 13 | 15 | 14 | 16 | 15 | 16 | 14 | 16 | 14 | 14 | 13 | 14 | 12 | 9 |
| 78 | 2 | 3 | 4 | 3 | 4 | 3 | 4 | 2 | 4 | 2 | 3 | 4 | 4 | 3 | 2 |
| 79 | 1 | 2 | 1 | 1 | 1 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1 |
| 81 | 2 | 2 | 3 | 3 | 4 | 4 | 3 | 4 | 3 | 3 | 4 | 4 | 3 | 3 | 2 |
| 87 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 0 |
df2 = df.groupby('GENDER').agg({ 'AGE' : 'count', 'SMOKING' : 'sum', 'YELLOW_FINGERS' : 'sum', 'ANXIETY' : 'sum',
'PEER_PRESSURE' : 'sum', 'CHRONIC_DISEASE' : 'sum', 'FATIGUE' : 'sum', 'ALLERGY' : 'sum', 'WHEEZING' : 'sum',
'ALCOHOL_CONSUMING' : 'sum', 'COUGHING' : 'sum', 'SHORTNESS_OF_BREATH' : 'sum',
'SWALLOWING_DIFFICULTY' : 'sum', 'CHEST_PAIN' : 'sum', 'LUNG_CANCER' : 'sum'})
df2
| AGE | SMOKING | YELLOW_FINGERS | ANXIETY | PEER_PRESSURE | CHRONIC_DISEASE | FATIGUE | ALLERGY | WHEEZING | ALCOHOL_CONSUMING | COUGHING | SHORTNESS_OF_BREATH | SWALLOWING_DIFFICULTY | CHEST_PAIN | LUNG_CANCER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| GENDER | |||||||||||||||
| 0 | 147 | 227 | 247 | 232 | 242 | 237 | 252 | 217 | 218 | 194 | 222 | 246 | 222 | 201 | 125 |
| 1 | 162 | 256 | 238 | 231 | 222 | 228 | 265 | 264 | 263 | 287 | 266 | 261 | 232 | 280 | 145 |
Most of the males having lung cancer
df3 = df.groupby('SMOKING').agg({ 'GENDER' : 'sum','AGE' : 'count', 'SMOKING' : 'sum', 'YELLOW_FINGERS' : 'sum', 'ANXIETY' : 'sum',
'PEER_PRESSURE' : 'sum', 'CHRONIC_DISEASE' : 'sum', 'FATIGUE' : 'sum', 'ALLERGY' : 'sum', 'WHEEZING' : 'sum',
'ALCOHOL_CONSUMING' : 'sum', 'COUGHING' : 'sum', 'SHORTNESS_OF_BREATH' : 'sum',
'SWALLOWING_DIFFICULTY' : 'sum', 'CHEST_PAIN' : 'sum', 'LUNG_CANCER' : 'sum'})
df3
| GENDER | AGE | SMOKING | YELLOW_FINGERS | ANXIETY | PEER_PRESSURE | CHRONIC_DISEASE | FATIGUE | ALLERGY | WHEEZING | ALCOHOL_CONSUMING | COUGHING | SHORTNESS_OF_BREATH | SWALLOWING_DIFFICULTY | CHEST_PAIN | LUNG_CANCER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SMOKING | ||||||||||||||||
| 1 | 68 | 135 | 135 | 213 | 190 | 206 | 214 | 228 | 210 | 220 | 214 | 223 | 217 | 196 | 201 | 115 |
| 2 | 94 | 174 | 348 | 272 | 273 | 258 | 251 | 289 | 271 | 261 | 267 | 265 | 290 | 258 | 280 | 155 |
smoking people have high problems and people who smoke got cancer more
df4 = df.groupby('YELLOW_FINGERS').agg({ 'GENDER' : 'sum','AGE' : 'count', 'SMOKING' : 'sum', 'YELLOW_FINGERS' : 'sum', 'ANXIETY' : 'sum',
'PEER_PRESSURE' : 'sum', 'CHRONIC_DISEASE' : 'sum', 'FATIGUE' : 'sum', 'ALLERGY' : 'sum', 'WHEEZING' : 'sum',
'ALCOHOL_CONSUMING' : 'sum', 'COUGHING' : 'sum', 'SHORTNESS_OF_BREATH' : 'sum',
'SWALLOWING_DIFFICULTY' : 'sum', 'CHEST_PAIN' : 'sum', 'LUNG_CANCER' : 'sum'})
df4
| GENDER | AGE | SMOKING | YELLOW_FINGERS | ANXIETY | PEER_PRESSURE | CHRONIC_DISEASE | FATIGUE | ALLERGY | WHEEZING | ALCOHOL_CONSUMING | COUGHING | SHORTNESS_OF_BREATH | SWALLOWING_DIFFICULTY | CHEST_PAIN | LUNG_CANCER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| YELLOW_FINGERS | ||||||||||||||||
| 1 | 86 | 133 | 209 | 133 | 156 | 175 | 197 | 231 | 218 | 213 | 229 | 211 | 226 | 169 | 215 | 107 |
| 2 | 76 | 176 | 274 | 352 | 307 | 289 | 268 | 286 | 263 | 268 | 252 | 277 | 281 | 285 | 266 | 163 |
mostly females have yellow fingers so any person with yellow fingers have high probability of getting lung cancer
df5 = df.groupby('ANXIETY').agg({ 'GENDER' : 'sum','AGE' : 'count', 'SMOKING' : 'sum', 'YELLOW_FINGERS' : 'sum', 'ANXIETY' : 'sum',
'PEER_PRESSURE' : 'sum', 'CHRONIC_DISEASE' : 'sum', 'FATIGUE' : 'sum', 'ALLERGY' : 'sum', 'WHEEZING' : 'sum',
'ALCOHOL_CONSUMING' : 'sum', 'COUGHING' : 'sum', 'SHORTNESS_OF_BREATH' : 'sum',
'SWALLOWING_DIFFICULTY' : 'sum', 'CHEST_PAIN' : 'sum', 'LUNG_CANCER' : 'sum'})
df5
| GENDER | AGE | SMOKING | YELLOW_FINGERS | ANXIETY | PEER_PRESSURE | CHRONIC_DISEASE | FATIGUE | ALLERGY | WHEEZING | ALCOHOL_CONSUMING | COUGHING | SHORTNESS_OF_BREATH | SWALLOWING_DIFFICULTY | CHEST_PAIN | LUNG_CANCER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ANXIETY | ||||||||||||||||
| 1 | 93 | 155 | 230 | 200 | 155 | 216 | 234 | 273 | 254 | 256 | 254 | 262 | 265 | 190 | 250 | 128 |
| 2 | 69 | 154 | 253 | 285 | 308 | 248 | 231 | 244 | 227 | 225 | 227 | 226 | 242 | 264 | 231 | 142 |
list(df5.columns)
['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY', 'PEER_PRESSURE', 'CHRONIC_DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING', 'ALCOHOL_CONSUMING', 'COUGHING', 'SHORTNESS_OF_BREATH', 'SWALLOWING_DIFFICULTY', 'CHEST_PAIN', 'LUNG_CANCER']
data.columns
Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
'PEER_PRESSURE', 'CHRONIC_DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
'ALCOHOL_CONSUMING', 'COUGHING', 'SHORTNESS_OF_BREATH',
'SWALLOWING_DIFFICULTY', 'CHEST_PAIN', 'LUNG_CANCER'],
dtype='object')
def corrMat2(df,target='LUNG_CANCER',figsize=(10,0.5),ret_id=False):
corr_mat = df.corr().round(2);shape = corr_mat.shape[0]
corr_mat = corr_mat.transpose()
corr = corr_mat.loc[:, df.columns == target].transpose().copy()
if(ret_id is False):
f, ax = plt.subplots(figsize=figsize)
sns.heatmap(corr,vmin=-0.3,vmax=0.3,center=0,
cmap='Blues',square=False,lw=2,annot=True,cbar=False)
plt.title(f'Feature Correlation to {target}')
if(ret_id):
return corr
corrMat2(df)
#!pip install imblearn
X = df.drop('LUNG_CANCER',axis = 1)
Y = df['LUNG_CANCER']
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=24)
X,Y = sm.fit_resample(X, Y)
print(X.shape)
print(Y.shape)
(540, 15) (540,)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=101)
print(X_train.shape)
print(X_test.shape)
(432, 15) (108, 15)
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_curve, precision_recall_curve, accuracy_score, roc_auc_score
knn = KNeighborsClassifier(n_neighbors = 4)
knn.fit(X_train.values,Y_train.values)
pred = knn.predict(X_test.values)
pred
array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0],
dtype=int64)
knn_acc = accuracy_score(Y_test.values,knn.predict(X_test.values))
print("Train Set Accuracy:"+str(accuracy_score(Y_train.values,knn.predict(X_train.values))*100))
print("Test Set Accuracy:"+str(accuracy_score(Y_test.values,knn.predict(X_test.values))*100))
Train Set Accuracy:94.9074074074074 Test Set Accuracy:91.66666666666666
print(confusion_matrix(Y_test.values,pred))
print(classification_report(Y_test.values,pred))
[[49 2]
[ 7 50]]
precision recall f1-score support
0 0.88 0.96 0.92 51
1 0.96 0.88 0.92 57
accuracy 0.92 108
macro avg 0.92 0.92 0.92 108
weighted avg 0.92 0.92 0.92 108
rfc= RandomForestClassifier(n_estimators =40, random_state = 0)
rfc.fit(X_train.values,Y_train.values)
y_pred = rfc.predict(X_test.values)
rfc_acc = accuracy_score(Y_test.values,rfc.predict(X_test.values))
print("Train Set Accuracy:"+str(accuracy_score(Y_train.values,rfc.predict(X_train.values))*100))
print("Test Set Accuracy:"+str(accuracy_score(Y_test.values,rfc.predict(X_test.values))*100))
Train Set Accuracy:100.0 Test Set Accuracy:98.14814814814815
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(X_train.values, Y_train.values)
preds = svc.predict(X_test.values)
svm_acc= accuracy_score(Y_test,svc.predict(X_test.values))
print("Train Set Accuracy:"+str(accuracy_score(Y_train.values,svc.predict(X_train.values))*100))
print("Test Set Accuracy:"+str(accuracy_score(Y_test.values,svc.predict(X_test.values))*100))
Train Set Accuracy:96.52777777777779 Test Set Accuracy:96.29629629629629
print(classification_report(Y_test.values,pred))
print(confusion_matrix(Y_test.values,pred))
precision recall f1-score support
0 0.88 0.96 0.92 51
1 0.96 0.88 0.92 57
accuracy 0.92 108
macro avg 0.92 0.92 0.92 108
weighted avg 0.92 0.92 0.92 108
[[49 2]
[ 7 50]]
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train.values, Y_train.values)
preds = dtc.predict(X_test.values)
dtc_acc= accuracy_score(Y_test.values,dtc.predict(X_test.values))
print("Train Set Accuracy:"+str(accuracy_score(Y_train.values,dtc.predict(X_train.values))*100))
print("Test Set Accuracy:"+str(accuracy_score(Y_test.values,dtc.predict(X_test.values))*100))
Train Set Accuracy:100.0 Test Set Accuracy:93.51851851851852
models = pd.DataFrame({
'Model': ['KNN', 'RFC', 'SVC', 'DTC'],
'Score': [ knn_acc, rfc_acc, svm_acc,dtc_acc]
})
models.sort_values(by = 'Score', ascending = False)
| Model | Score | |
|---|---|---|
| 1 | RFC | 0.981481 |
| 2 | SVC | 0.962963 |
| 3 | DTC | 0.935185 |
| 0 | KNN | 0.916667 |
models.to_csv(r'models.csv',index=False)
fig = px.bar(models, x='Model', y='Score')
fig.update_layout(width=500,height=400)
fig.show()
# pickling the model
import pickle
pickle_out = open("output.pkl", "wb")
pickle.dump(rfc, pickle_out)
pickle_out.close()
pickle_in = open('output.pkl', 'rb')
rfc = pickle.load(pickle_in)
prediction = rfc.predict([[0,63,1,2,1,1,1,1,1,2,1,2,2,1,1]])
print(prediction)
[0]
prediction = rfc.predict([[0,59,1,1,1,2,1,2,1,2,1,2,2,1,2]])
print(prediction)
[1]